plot_expr = function(coord_1 = dmap$DC1, coord_2 = dmap$DC3, gene_name, labels = c("DC1", "DC3"), sce = sub_sce){
plot_df = data.frame(X = coord_1, Y = coord_2, expr = as.numeric(logcounts(sce[match(gene_name, genes[,2]),])))
plot_df = plot_df[order(plot_df$expr, decreasing = FALSE),]
p = ggplot(plot_df, aes(x = X, y=Y, col = expr)) +
geom_point() +
scale_color_gradient2(name = paste0(gene_name, "\nlog2\ncounts"), mid = "cornflowerblue", low = "gray75", high = "black", midpoint = max(plot_df$expr)/2) +
labs(x = labels[1], y= labels[2])
return(p)
}
library(Matrix)
library(scran)
library(Rtsne)
library(irlba)
library(igraph)
library(reshape2)
library(scales)
library(ggrepel)
library(destiny)
library(viridis)
library(scater)
library(matrixStats)
# library(velocyto.R)
library(FateID)
library(knitr)
library(RColorBrewer)
library(pheatmap)
library(biomaRt)
source("/nfs/research1/marioni/jonny/embryos/scripts/core_functions.R")
load_data(remove_doublets = TRUE, remove_stripped = TRUE, load_corrected = TRUE)
#set it up for scran to be properly parallelised
library(BiocParallel)
ncores = 1
mcparam = SnowParam(workers = ncores)
register(mcparam)
no_ticks = theme(axis.ticks = element_blank(), axis.text = element_blank())
no_title = no_ticks + theme(axis.title = element_blank())
In this script we will perform some analysis on a single germ layer of the embryo: the endoderm. Endodermal cells are highlighted in the manuscript Figure 1 UMAP in Figure 1 in this document. A notable feature is the contiguous nature of the extraembryonic (ExE) Visceral endoderm and the embryonic Definitive endoderm, via the “Gut” annotated cells.
umap = read.table("/nfs/research1/marioni/jonny/embryos/scripts/vis/umap/umap.tab", header = FALSE, sep = "\t")
new_cols = as.character(meta$celltype)
new_cols[!new_cols %in% c("Anterior Primitive Streak", "Def. endoderm", "Gut", "Visceral endoderm", "ExE endoderm")] = "Other"
umap_global = ggplot(umap, aes(x= V1, y = V2, col = factor(new_cols, levels = c(names(celltype_colours), "Other"), ordered = TRUE))) +
scale_color_manual(values = c(celltype_colours, "Other" = "darkgrey"), name = "") +
geom_point(size = 0.2) +
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
guides(col = guide_legend(override.aes = list(size = 5)))
umap_global
Figure 1: Endoderm clusters are highlighted on the Figure 1 t-SNE
Work from the Hadjantonakis lab ( Kwon et al. ) showed that visceral endoderm cells do contribute to the embryonic gut - combined with the UMAP shown above, this suggests that we can molecularly dissect this apparently convergent process of endoderm formation.
We now select the appropriate cell types for our analysis, as highlighted above. There is one exception, however: the exclusion of two subclusters of definitive endoderm cells that shows notochordal properties. This cluster and its expression of the Noto gene is shown in Figure 2.
expr = as.numeric(logcounts(sce[match("Noto", genes[,2]),]))
order = order(expr)
p1 = ggplot(umap[order,], aes(x= V1, y = V2, col = expr[order])) +
scale_color_gradient2(name = "Noto\nlog2\ncounts", mid = "cornflowerblue", low = "gray75", high = "black", midpoint = max(expr)/2) +
geom_point(size = 0.2) +
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank())
sub_meta = meta[meta$cluster == 9 & meta$cluster.sub %in% c(1:3), ]
label = sapply(sub_meta$cluster.sub, switch,
"1" = "Notochord",
"2" = "Retained DE",
"3" = "Excluded DE")
p2 = ggplot(mapping = aes(x = factor(label), y = expr[match(sub_meta$cell, meta$cell)], fill = label)) +
geom_boxplot() +
labs(y = "Noto log2 counts") +
theme(axis.title.x = element_blank(),
legend.position = "none") +
scale_fill_manual(values = c("Notochord" = celltype_colours["Notochord"],
"Retained DE" = celltype_colours["Def. endoderm"],
"Excluded DE" = "cornflowerblue"))
p = plot_grid(p1, p2, ncol = 1, rel_heights = c(0.6, 0.4))
p
Figure 2: Expression of Noto gene shown on UMAP and with respect to the excluded DE cells
keep = meta$cluster == 11 | ( meta$cluster == 9 & !meta$cluster.sub %in% c(1,3) )
sub_sce = normalize(sce[, keep])
sub_meta = meta[keep, ]
sub_corrected = corrected$all[keep,]
write.table(sub_corrected, file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/test.tab", quote = FALSE, row.names = FALSE, col.names = FALSE)
save(sub_sce, sub_meta, file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/beginning_data.RData")
# load("/nfs/research1/marioni/jonny/embryos/scripts/endoderm/beginning_data.RData")
This leaves us with the cells highlighted in the umap in Figure 3.
# new_cols = as.character(meta$celltype)
# new_cols[!new_cols %in% c("Anterior Primitive Streak", "Def. endoderm", "Gut", "Visceral endoderm", "ExE endoderm")] = "Other"
new_cols[meta$cluster == 9 & meta$cluster.sub %in%c(1,3)] = "Other"
umap_global = ggplot(umap, aes(x= V1, y = V2, col = factor(new_cols, levels = c(names(celltype_colours), "Other"), ordered = TRUE))) +
scale_color_manual(values = c(celltype_colours, "Other" = "darkgrey"), name = "") +
geom_point(size = 0.2) +
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
guides(col = guide_legend(override.aes = list(size = 5)))
umap_global
Figure 3: Endoderm clusters are highlighted on the Figure 1 t-SNE
ggsave(umap_global,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/umap_global.pdf",
width = 9, height = 6)
We now visualise these cells in a number of different ways. We first perform a batch correction procedure using only these subsetted cells. This batch correction is identical to that considered for our global visualisation for Figure 1 of the manuscript. We use these batch correction coordinates for the following plots.
hvgs = getHVGs(sub_sce)
#get order: oldest to youngest; most cells to least cells
order_df = sub_meta[!duplicated(sub_meta$sample), c("stage", "sample")]
order_df$ncells = sapply(order_df$sample, function(x) sum(sub_meta$sample == x))
order_df$stage = factor(order_df$stage,
levels = rev(c("E8.5",
"E8.25",
"E8.0",
"E7.75",
"E7.5",
"E7.25",
"mixed_gastrulation",
"E7.0",
"E6.75",
"E6.5")))
order_df = order_df[order(order_df$stage, order_df$ncells, decreasing = TRUE),]
order_df$stage = as.character(order_df$stage)
set.seed(42)
corrected = doBatchCorrect(counts = logcounts(sub_sce)[rownames(sub_sce) %in% hvgs,],
timepoints = sub_meta$stage,
samples = sub_meta$sample,
timepoint_order = order_df$stage,
sample_order = order_df$sample,
npc = 50)
The cells are visualised by t-SNE in Figure 4 coloured by stage, and 5 coloured by celltype
set.seed(42)
tsne = Rtsne(corrected, pca = FALSE, perplexity = 60)$Y
no = sample(nrow(corrected), nrow(corrected))
ggplot(mapping = aes(x = tsne[no, 1], y= tsne[no, 2], col = sub_meta$stage[no])) +
geom_point() +
scale_color_manual(values = stage_colours, name= "Stage", labels = stage_labels) +
theme(panel.background = element_rect(fill = "grey80")) +
labs(x = "tSNE1", y= "tSNE2")
Figure 4: Cell and their collection timepoints visualised by t-SNE
The first fifty principal components were selected for this visualisation.
ggplot(mapping = aes(x = tsne[no, 1], y= tsne[no, 2], col = factor(sub_meta$celltype[no]))) +
geom_point() +
scale_color_manual(values = celltype_colours, name= "Celltype") +
labs(x = "tSNE1", y= "tSNE2")
Figure 5: Cells and their annotated celltype visualised by t-SNE
The first fifty principal components were selected for this visualisation.
The cells and their celltypes are visualised in Figure 6.
write.table(corrected, file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/corrected.tab", quote = FALSE, row.names = FALSE, col.names = FALSE)
shell = "/nfs/research1/marioni/jonny/embryos/embryos2017.simg"
script = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/umap.py"
#this was run separately
# system(paste("singularity exec", shell, "python3", script))
umap_endo = read.table("/nfs/research1/marioni/jonny/embryos/scripts/endoderm/umap.tab")
ggplot(mapping = aes(x = umap_endo[no, 1], y= umap_endo[no, 2], col = sub_meta$celltype[no])) +
geom_point() +
scale_color_manual(values = celltype_colours, name= "Celltype") +
labs(x = "UMAP1", y= "UMAP2")
Figure 6: Cells projected via UMAP, labelled by celltype
The cells’ transcriptomes are visualised using the batch-corrected PCA in Figure 7). We can see that PC1 separates cells based on their origin tissue (i.e. embryonic/extraembryonic) while PC2 separates by stage.
p1 = ggplot(mapping = aes(x = corrected[no, 1], y= corrected[no, 2], col = sub_meta$stage[no])) +
geom_point() +
scale_color_manual(values = stage_colours, name= "Stage", labels = stage_labels) +
theme(panel.background = element_rect(fill = "grey80")) +
labs(x = "Corrected PC1", y= "Corrected PC2")
p2 = ggplot(mapping = aes(x = corrected[no, 1], y= corrected[no, 2], col = sub_meta$celltype[no])) +
geom_point() +
scale_color_manual(values = celltype_colours, name= "Celltype") +
labs(x = "Corrected PC1", y= "Corrected PC2")
plot_grid(p1, p2, ncol = 1)
Figure 7: PCA of considered cells
PC2 separates cells according to their developmental stage. PC1 separates Visceral and Definitive endoderm, as shown by expression of Ttr, Apoe, and Bmp7.
Finally, we have visualised the cells using Diffusion Maps in Figure 8, which was itself calculated from the batch-corrected PCA above.
dmap = DiffusionMap(corrected)
dmap_stage = ggplot(mapping = aes(x = dmap$DC1[no], y= dmap$DC2[no], col = sub_meta$stage[no])) +
geom_point(size = 0.7) +
scale_color_manual(values = stage_colours, name = "Stage", stage_labels) +
labs(x = "DC1", y = "DC2")
dmap_celltype = ggplot(mapping = aes(x = dmap$DC1[no], y= dmap$DC2[no], col = sub_meta$celltype[no])) +
geom_point(size = 0.7) +
scale_color_manual(values = celltype_colours, name = "Celltype") +
labs(x = "DC1", y = "DC2") +
guides(colour = guide_legend(override.aes = list(size = 5)))
dmap_celltype_2 = ggplot(mapping = aes(x = dmap$DC1[no], y= dmap$DC3[no], col = sub_meta$celltype[no])) +
geom_point(size = 0.7) +
scale_color_manual(values = celltype_colours, name = "Celltype") +
labs(x = "DC1", y = "DC3")
dmap_stage_2 = ggplot(mapping = aes(x = dmap$DC1[no], y= dmap$DC3[no], col = sub_meta$stage[no])) +
geom_point(size = 0.7) +
scale_color_manual(values = stage_colours, name = "Stage", stage_labels) +
labs(x = "DC1", y = "DC3") +
guides(colour = guide_legend(override.aes = list(size = 5)))
plot_grid(dmap_stage, dmap_celltype, dmap_stage_2, dmap_celltype_2, ncol =2, rel_widths = c(0.45, 0.55, 0.45, 0.55))
Figure 8: Diffusion map embeddings of the data are shown
We can now being to investigate patterns of gene expression in the data (Figure 9). Two tips can be accounted for by our origin tissues - with Ttr marking the visceral endoderm, and Mixl1 marking cells emerging from the primitive streak.
dmap_ttr = plot_expr(dmap$DC1, dmap$DC2, "Ttr")
dmap_mixl = plot_expr(dmap$DC1, dmap$DC2, "Mixl1")
plot_grid(dmap_ttr, dmap_mixl, ncol = 2, labels = "AUTO")
Figure 9: Gene expression patterns that mark the early embryonic tissues
Moreover, inspection of diffusion component 3 reveals some of the specialisation of later gut tissues: Cdx2 marks the hindgut, Nepn the midgut, Pyy the foregut, and Nkx2-5 the pharyngeal endoderm. We will return to these tissues in greater detail later in this document.
dmap_cdx = plot_expr(dmap$DC1, dmap$DC3, "Cdx2", labels = c("DC1", "DC3"))
dmap_nepn = plot_expr(dmap$DC1, dmap$DC3, "Nepn", labels = c("DC1", "DC3"))
dmap_pyy = plot_expr(dmap$DC1, dmap$DC3, "Pyy", labels = c("DC1", "DC3"))
dmap_nkx = plot_expr(dmap$DC1, dmap$DC3, "Nkx2-5", labels = c("DC1", "DC3"))
plot_grid(dmap_cdx, dmap_nepn, dmap_pyy, dmap_nkx)
Figure 10: Gene expression patterns that mark the tissues of the embryonic gut
Finally, we also show the expression of a set of genes that were considered in a recent publication on endoderm formation and intercalation (Viotti et al. 2014) in Figure 11.
afp_plot = plot_expr(dmap$DC1, dmap$DC2, "Afp")
sox_plot = plot_expr(dmap$DC1, dmap$DC2, "Sox17")
foxa_plot = plot_expr(dmap$DC1, dmap$DC2, "Foxa2")
cdh_plot = plot_expr(dmap$DC1, dmap$DC2, "Cdh1")
plot_grid(afp_plot, sox_plot, foxa_plot, cdh_plot, ncol = 2, labels = "AUTO")
Figure 11: Genes considered in Viotti et al 2014
Afp was used to mark VE cells; Sox17 and Foxa2 were used to mark DE cells that were preparing to egress or were actively egressing; Cdh1 was observed in cells that either had egressed or had aborted an egression attempt.
ggsave(dmap_stage + no_ticks,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_stage.pdf",
width = 6, height = 4)
ggsave(dmap_celltype + no_ticks,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_celltype.pdf",
width = 6, height = 4)
ggsave(dmap_ttr + no_ticks + ggtitle("Ttr") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_ttr.pdf",
width = 6, height = 4)
ggsave(dmap_mixl + no_ticks + ggtitle("Mixl1") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_mixl.pdf",
width = 6, height = 4)
ggsave(dmap_celltype_2 + no_ticks,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_celltype_dc3.pdf",
width = 6, height = 4)
ggsave(dmap_stage_2 + no_ticks,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_stage_dc3.pdf",
width = 6, height = 4)
ggsave(dmap_cdx + no_ticks + ggtitle("Cdx2") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_cdx.pdf",
width = 6, height = 4)
ggsave(dmap_nepn + no_ticks + ggtitle("Nepn") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_nepn.pdf",
width = 6, height = 4)
ggsave(dmap_nkx + no_ticks + ggtitle("Nkx2-5") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_nkx.pdf",
width = 6, height = 4)
ggsave(dmap_pyy + no_ticks + ggtitle("Pyy") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_pyy.pdf",
width = 6, height = 4)
In order to summarise this information into two dimensions, we have also generated a force-directed layout of a graph built from the diffusion embedding. Specifically, the graph is a KNN structure, connecting each cell to its ten nearest neighbours in the first fifteen diffusion components. The celltype and stage information for these cells is shown in Figure 12.
graph_df = read.table("/nfs/research1/marioni/jonny/embryos/scripts/endoderm/gephi.tab", header = TRUE, sep = "\t")
graph_celltype = ggplot(mapping = aes(x = graph_df$gephiX[no], y= graph_df$gephiY[no], col = sub_meta$celltype[no])) +
geom_point(size = 0.7) +
scale_color_manual(values = celltype_colours, name = "Celltype") +
labs(x = "DC1", y = "DC2") +
guides(colour = guide_legend(override.aes = list(size = 5))) +
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank())
graph_stage = ggplot(mapping = aes(x = graph_df$gephiX[no], y= graph_df$gephiY[no], col = sub_meta$stage[no])) +
geom_point(size = 0.7) +
scale_color_manual(values = stage_colours, labels = stage_labels, name = "Timepoint") +
theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank())
plot_grid(graph_celltype, graph_stage)
Figure 12: Force-directed layout of a K-nearest neighbour graph built from the first 15 diffusion components
Marker gene expression is shown in Figure 13.
theme = theme(axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank())
graph_ttr = plot_expr(graph_df$gephiX, graph_df$gephiY, "Ttr") + theme
graph_mixl = plot_expr(graph_df$gephiX, graph_df$gephiY, "Mixl1") + theme
graph_cdx = plot_expr(graph_df$gephiX, graph_df$gephiY, "Cdx2") + theme
graph_nepn = plot_expr(graph_df$gephiX, graph_df$gephiY, "Nepn") + theme
graph_pyy = plot_expr(graph_df$gephiX, graph_df$gephiY, "Pyy") + theme
graph_nkx = plot_expr(graph_df$gephiX, graph_df$gephiY, "Nkx2-5") + theme
plot_grid(graph_ttr, graph_mixl, graph_cdx, graph_nepn, graph_pyy, graph_nkx, nrow = 3)
Figure 13: Marker gene expression on the force-directed layout
ggsave(graph_ttr + no_title + ggtitle("Ttr") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/graph_ttr.pdf",
width = 6, height = 4)
ggsave(graph_mixl + no_title + ggtitle("Mixl1") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/graph_mixl.pdf",
width = 6, height = 4)
ggsave(graph_celltype + no_title,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/graph_celltype.pdf",
width = 6, height = 4)
ggsave(graph_stage + no_title,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/graph_stage.pdf",
width = 6, height = 4)
ggsave(graph_cdx + no_title + ggtitle("Cdx2") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/graph_cdx.pdf",
width = 6, height = 4)
ggsave(graph_nepn + no_title + ggtitle("Nepn") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/graph_nepn.pdf",
width = 6, height = 4)
ggsave(graph_nkx + no_title + ggtitle("Nkx2-5") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/graph_nkx.pdf",
width = 6, height = 4)
ggsave(graph_pyy + no_title + ggtitle("Pyy") + theme(plot.title = element_text(face = "bold.italic")),
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/graph_pyy.pdf",
width = 6, height = 4)
To define the structure of the cell transcrtopes of the embryonic gut, we now classify the gut cells using the data in our two latest timepoints: E8.5 and E8.25. These cells are highlighted in Figure 14.
keep = sub_meta$stage %in% c("E8.5", "E8.25") & sub_meta$celltype == "Gut"
qplot(dmap$DC1[order(keep)], dmap$DC3[order(keep)], col = keep[order(keep)]) +
geom_point() +
scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "darkgrey"),
labels = c("TRUE" = "E8.5 & E8.25", "FALSE" = "Other"),
name = "") +
labs(x = "DC1", y= "DC3")
Figure 14: Cells selected for gut clustering
late_sce = sub_sce[,keep]
late_corrected = corrected[keep,]
late_meta = sub_meta[keep,]
late_dmap = DiffusionMap(late_corrected)
These cells are projected via diffusion map in Figure 15. This diffusion map was calculated from subsetted PC coordinates from the batch-correction performed above. Clusters are highlighted by colour; clusters were determined using Louvain clustering (igraph::cluster_louvain) on a shared nearest neighbour graph (scran::buildSNNGraph) both with default parameters. Tables of the top marker genes for each cluster are shown in Table 1.
set.seed(42)
graph = buildSNNGraph(late_corrected, d= NA, transposed = TRUE)
late_clusts = as.numeric(membership(cluster_louvain(graph)))
p1 = qplot(late_dmap$DC1, late_dmap$DC2, col = factor(late_clusts)) +
scale_colour_Publication(name = "Cluster") +
labs(x = "DC1", y = "DC2")
ro = sample(nrow(late_meta), nrow(late_meta))
p2 = qplot(late_dmap$DC1[ro], late_dmap$DC2[ro], col = factor(late_meta$stage)[ro]) +
scale_colour_Publication(name = "Stage") +
labs(x = "DC1", y = "DC2")
plot_grid(p1, p2)
Figure 15: Gut cells from the final two timepoints are shown, coloured by their newly-calculated cluster and their collection timepoint
markers = findMarkers(late_sce, clusters = factor(late_clusts), block = late_meta$sample, direction = "up", pval.type = "all", subset.row = calcAverage(late_sce) > 0.1)
top = sapply(markers, function(x){
genes[match(head(rownames(x), n = 10), genes[,1]), 2]
})
kable(top, caption = "Top marker genes for the gut clusters.")
| 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|---|---|---|---|---|---|---|
| Fzd2 | Nkx2-3 | Hhex | Nudt11 | Cdx1 | Cxcl12 | Trap1a |
| Osr1 | Isl1 | Cdkn1c | Col4a1 | Tpd52l1 | Glud1 | Rhox5 |
| Hoxa1 | Nkx2-5 | Ttr | Nepn | Cited1 | 2610528A11Rik | Itm2b |
| Hoxb1 | Kctd12b | Smarcd3 | Col4a2 | Wnt6 | Rbpms2 | Ypel3 |
| Ripply3 | Pgk1 | Ddt | Sesn3 | Ncl | Kitl | Xlr3a |
| Hmgn3 | Tnnt1 | Sfrp5 | Prss8 | Hoxb2 | Sct | Mgst3 |
| Irx2 | Gfpt2 | Phlda2 | Cst3 | Zfp503 | B4galt6 | Sccpdh |
| Sulf1 | Cd24a | Fam210b | Pdzk1ip1 | Stra6 | Dnajc19 | Tfpi |
| Irx1 | Foxe1 | Bambi | Hacd4 | Tmem121 | Hoxa10 | Arpc1b |
| Cadm1 | Has2 | Col1a2 | Fth1 | Mad2l1 | Jak1 | Trf |
# test = lapply(markers, function(x){
# hit = head(rownames(x), n = 30)
# enrich_genes_goana(hit, rownames(x), warn_missing = FALSE)
# })
#
# test = lapply(test, function(x) x$result)
Cluster 3 very highly expresses the gene gene Hhex, which marks the foregut. This is shown in Figure 16.
expr_boxplot = function(gene, clusters = late_clusts, sce = late_sce, genes_df = genes){
pdf = data.frame(clust = clusters, expr = logcounts(sce)[match(gene, genes_df[,2]),])
p = ggplot(pdf, aes(x = factor(clust), y = expr)) +
geom_boxplot() +
labs(x = "Cluster", y = paste(gene, "log2 expression"))
return(p)
}
gut_plot = function(gene_name){
p1 = plot_expr(late_dmap$DC1, late_dmap$DC2, gene_name, c("DC1", "DC2"), late_sce)
p2 = expr_boxplot(gene_name)
plot_grid(p1, p2)
}
gut_plot("Hhex")
Figure 16: Hhex expression in the gut cell subset
This cluster also expresses the gene Ttr (Figure 17, a liver progenitor marker), and the ventral foregut marker Gata4 (Figure 18; ref1 ref2). We label cluster 3 as Foregut 1.
gut_plot("Ttr")
Figure 17: Ttr expression in the gut cell subset
cluster_labels = c()
gut_plot("Gata4")
Figure 18: Gata4 expression in the gut cell subset
cluster_labels = c(cluster_labels, "3" = "Foregut 1")
frac_3 = sum(late_meta$stage == "E8.25" & late_clusts == 3)/sum(late_clusts == 3)
frac_1 = sum(late_meta$stage == "E8.25" & late_clusts == 1)/sum(late_clusts == 1)
Cluster 1 expresses Irx1, linked to lung development (Figure 19; reference) and Hmgn3 and, linked to pancreas development (Figure 20; reference). Note also that this cluster contains a considerably larger fraction of E8.25 cells than cluster 3 (45% vs 30%), and it is therefore likely that this cluster also contains younger foregut cells. We label these cells as Foregut 2.
gut_plot("Irx1")
Figure 19: Irx1 expression in the gut cell subset
gut_plot("Hmgn3")
Figure 20: Hgmn3 expression in the gut cell subset
cluster_labels = c(cluster_labels, "1" = "Foregut 2")
Cluster 4 highly expresses Nepn (a midgut marker, Figure 21, ref), so we label these cells as midgut.
gut_plot("Nepn")
Figure 21: Nepn expression in the gut cell subset
cluster_labels = c(cluster_labels, "4" = "Midgut")
Both clusters 7 and 5 highly express Cdx2, a marker of the hindgut (Figure 22). We label these clusters as Hindgut 1 and 2 respectively
gut_plot("Cdx2")
Figure 22: Cdx2 expression in the gut cell subset
cluster_labels = c(cluster_labels, "7" = "Hindgut 1", "5" = "Hindgut 2")
Cluster 2 expresses Nkx2-5 (Figure 23), which indicates that it is pharyngeal endoderm.
gut_plot("Nkx2-5")
Figure 23: Nkx2-5 expression in the gut cell subset
cluster_labels = c(cluster_labels, "2" = "Pharyngeal endoderm")
Cluster 6 sits between the midgut and hindgut clusters, but note that it expresses intermediate levels of the markers for each (Figures 22 for Cdx2, 21 for Nepn). We therefore label these cells as intermediate midgut/hindgut cells.
cluster_labels = c(cluster_labels, "6" = "Midgut/Hindgut")
cluster_labels = cluster_labels[order(names(cluster_labels))]
cluster_colours = c(brewer_pal(palette = "Dark2")(length(cluster_labels)))
names(cluster_colours) = names(cluster_labels)
cluster_levels = c(7, 5, 6, 4, 1, 3, 2)
A summary of these classifications is shown in Figure 24, including some additional marker genes.
get_gene_means = function(gene, gene_df = genes, sce_obj = sce, celltypes = meta$celltype){
ens = as.character(genes[match(gene, genes[,2]),1])
expr = as.numeric(logcounts(sce_obj[ens,]))
medians = aggregate(formula = expr ~ celltypes, FUN = mean)
mat = matrix(medians$expr, nrow = 1, dimnames = list(gene, medians$celltypes))
return(mat)
}
targets = c("Nepn", "Cdx2", "Nkx2-5", "Isl1", "Hmgn3", "Irx1", "Gata4", "Ttr", "Hhex", "Foxe1", "Pax9")
means = lapply(targets, get_gene_means, sce_obj = late_sce, celltypes = cluster_labels[as.character(late_clusts)])
combined = do.call(rbind,means)
combined = sweep(combined, 1, apply(combined, 1, max), "/")
# heatmap_late = pheatmap(combined, cluster_rows = TRUE, cluster_cols = TRUE, color = viridis::viridis(100))
geneclust = hclust(dist(combined))
gene_order = geneclust$labels[geneclust$order]
clust_order = c("Hindgut 1", "Hindgut 2", "Midgut/Hindgut", "Midgut", "Foregut 1", "Foregut 2", "Pharyngeal endoderm")
pdf = melt(combined[gene_order, clust_order])
heatmap_late = ggplot(pdf, aes(x = Var2, y = Var1, fill = value)) +
geom_tile() +
scale_fill_viridis(labels = c(0,"Max."), breaks = c(0,1), name = "Mean\nexpression") +
theme(axis.title = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(angle = -30, hjust = 0))
heatmap_late
Figure 24: Expression patterns for each annotated cluster is shown
The mean log-expression is plotted, normalised by the maximum mean expression per gene.
This gives the following labelling of the late gut cells, shown in Figure 25
late_dmap_labelled = qplot(late_dmap$DC1, late_dmap$DC2, col = factor(late_clusts)) +
scale_colour_manual(name = "Cluster", values = cluster_colours, labels = cluster_labels) +
guides(colour = guide_legend(override.aes = list(size = 5))) +
labs(x = "DC1", y = "DC2")
late_dmap_labelled
Figure 25: Diffusion map of gut cells, coloured and labelled by assigned cluster
Finally, we show and enlarged heatmap in Figure 26 that includes the top 10 unqiuely expressed genes for each cluster (as found using scran::findMarkers).
targets = as.character(top)
means = lapply(targets, get_gene_means, sce_obj = late_sce, celltypes = cluster_labels[as.character(late_clusts)])
combined = do.call(rbind,means)
combined = sweep(combined, 1, apply(combined, 1, max), "/")
# heatmap_late = pheatmap(combined, cluster_rows = TRUE, cluster_cols = TRUE, color = viridis::viridis(100))
geneclust = hclust(dist(combined))
gene_order = geneclust$labels[geneclust$order]
clust_order = c("Hindgut 1", "Hindgut 2", "Midgut/Hindgut", "Midgut", "Foregut 1", "Foregut 2", "Pharyngeal endoderm")
pdf = melt(combined[gene_order, clust_order])
ggplot(pdf, aes(x = Var2, y = Var1, fill = value)) +
geom_tile() +
scale_fill_viridis(labels = c(0,"Max."), breaks = c(0,1), name = "Mean\nexpression") +
theme(axis.title = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(angle = -30, hjust = 0))
Figure 26: The expression of the top 10 DE genes for each cluster is shown
The embryonic gut runs along the length of the embryo. Given the whole-embryo sampling that we have performed, it should be possible to use the transcriptional heterogeneity observed in the gut cells to order them along this axis. To do this, we consider only the E8.5 cells (to exclude effects of development over time), and recomputed a diffusion map with these cells. To order the cells, we computed DPT from the pharyngeal endoderm cell with most extreme DC2 value. In Figure 27, the first two components of the diffusion map are shown, the DPT on these diffusion components, as well as the density of cells of different celltypes along the DPT.
keep = late_meta$stage == "E8.5"
gut_sce = scater::normalize(late_sce[,keep])
gut_meta = late_meta[keep,]
gut_corrected = late_corrected[keep,]
gut_clusts = late_clusts[keep]
set.seed(42)
gut_dmap = DiffusionMap(gut_corrected)
gut_dpt = DPT(gut_dmap)
gut_dpt = gut_dpt[[paste0("DPT", which.min(gut_dmap$DC2))]]
p1 = ggplot(mapping = aes(x = gut_dmap$DC1, y= gut_dmap$DC2, col = factor(gut_clusts)))+
geom_point() +
geom_point(mapping = aes(x = gut_dmap$DC1[which.min(gut_dmap$DC2)], y = min(gut_dmap$DC2)), col = "black", size = 3, inherit.aes = FALSE) +
scale_color_manual(values = cluster_colours, labels = cluster_labels, name = "") +
labs(x = "DC1", y = "DC2") +
theme(legend.position = "none")
p2 = ggplot(mapping = aes(x = gut_dmap$DC1, y= gut_dmap$DC2, col = gut_dpt))+
geom_point() +
scale_color_viridis(name = "DPT") +
labs(x = "DC1", y = "DC2")
gut_density = ggplot(mapping = aes(x = gut_dpt, fill = factor(gut_clusts, levels = cluster_levels, ordered = TRUE)))+
geom_density(alpha = 0.5) +
scale_fill_manual(values = cluster_colours, labels = cluster_labels, name = "") +
theme(legend.position = c(0.0, 0.8))
plot_grid(plot_grid(p1, p2, labels = "AUTO"),
gut_density, nrow = 2, labels = c(NULL, "C"))
Figure 27: Ordering of gut cells along diffusion components
A: Gut cells projected via diffusion maps. The black coloured cell indicates the DPT starting position. B: Gut cells projected via diffusion maps, coloured by DPT value. C: Ordering of gut clusters along DPT.
ggsave(gut_density,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/gut_spatial_density.pdf",
width = 6, height = 4)
Having ordered cells along the first diffusion component, we now seek to identify how different genes are changing along this path. These genes may help to explain the molecular underpinnings of the contribution of these two groups of cells. We use the following procedure (similar to Ibarra-Soria et al. 2017 ):
Calculate highly variable genes for the cell subset. Retain only highly-variable genes. Genes that are not especially variable over these set of cells are unlikely to be variable over DPT.
For each retained gene, create two linear models that regress the log2 normalised expression levels for each cell against the values of DPT calculated above. The first model considers the expression as a constant across all DPT values. The second model considers expression as a degree-2 polynomial function of the DPT value. Retain the genes for which the degree-2 polynomial fits the data better (F-test p < 0.05, R function anova.lm). We consider these genes to vary over DPT.
To each gene, fit a local regression to the expression level for each cell at each value of DPT (R function loess, span = 0.75). Store the predicted value of the loess fit for one thousand equally spaced points across the DPT to smooth the data. By using equally spaced points, this prevents gene clustering being determined exclusively by locations with high cell density.
Normalise the fitted trends such that the minimum value is 0, and the maximum value is 1. This prevents clustering of genes according to global expression levels, rather than local expression patterns.
To cluster the fitted expression profiles, calculate the pearson correlation distance between each gene, and perform hierarchical clustering using the average agglomeration method (UPGMA). Cut the tree using the R package dynamicTreeCut, with a minimum cluster size of 50 genes.
The gene clusters identified in this manner are shown in Figure 28.
#This function determines the expression trajectories for clustering
get_predict = function(count, dpt,
model_mode = c("loess", "quadratic"),
predict_mode = c("cells", "even")){
if(model_mode[1] == "loess"){
model = loess(count ~ dpt)
newdat = data.frame(dpt = dpt)
if(predict_mode[1] == "even"){
newdat = data.frame(dpt = seq(from = min(dpt), to = max(dpt), length.out = 1000))
}
} else if(model_mode[1] == "quadratic"){
dpt2 = dpt^2
model = lm(count ~ dpt + dpt2)
newdat = data.frame(dpt = dpt, dpt2 = dpt2)
if(predict_mode[1] == "even"){
newdat = data.frame(dpt = seq(from = min(dpt), to = max(dpt), length.out = 1000),
dpt2 = seq(from = min(dpt), to = max(dpt), length.out = 1000)^2)
}
}
pred = predict(model, newdata = newdat)#, dpt2 = coords^2))
return(pred)
}
#This function identtifies genes that vary, and performs clustering
get_gene_clusters = function(sce, dpt, seed = 42, predict_even = TRUE, normalise = TRUE, model_mode = c("loess", "quadratic")){
require(dynamicTreeCut)
set.seed(seed)
#find genes that vary along DPT
dpt2 = dpt^2
retain = sapply(1:nrow(sce), function(x){
expr = logcounts(sce)[x,]
mod = lm(expr ~ dpt + dpt2)
base = lm(expr ~ 1)
# dAIC = AIC(mod) - AIC(base)
# return(dAIC < -25)
anova = anova(base,mod)
return(anova$`Pr(>F)`[2])
})
retain = p.adjust(retain, "fdr") < 0.05
sce = sce[retain,]
#fit the genes that vary
predict_mode = ifelse(predict_even, "even", "cells")
predicts = lapply(1:nrow(sce), function(x) get_predict(as.numeric(logcounts(sce)[x,]),
dpt,
predict_mode = predict_mode,
model_mode = model_mode[1]))
predicts = do.call(cbind, predicts)
if(normalise){
#shift minimum value to 0
predicts = sweep(predicts,
2,
matrixStats::colMins(predicts),
"-")
#scale to max 1
predicts = sweep(predicts,
2,
matrixStats::colMaxs(predicts),
"/")
}
colnames(predicts) = rownames(sce)
#cluster
dm = sqrt( (1- cor(predicts, method = "pearson")) /2)
# dm = as.matrix(dist(t(predicts)))
hclust = hclust(as.dist(dm), method = "average")
clusts = cutreeDynamic(dendro = hclust, minClusterSize = 50, distM = dm)
names(clusts) = colnames(predicts)
return(list(clusters = clusts, hclust = hclust, predictions = predicts, dpt = dpt, expression = logcounts(sce)))
}
plot_cluster_averages = function(get_gene_clusters_out, predict_even = FALSE){
input = get_gene_clusters_out
means = sapply(1:length(unique(input$clusters)), function(x){
rowMeans(input$predictions[,input$clusters==x])
})
pdf = melt(means)
names(pdf) = c("cellnum", "cluster", "val")
if(predict_even){
pdf$dpt = pdf$cellnum
} else{
pdf$dpt = input$dpt[pdf$cellnum]
}
pdf$sd = sapply(1:nrow(pdf), function(x){
sd(input$predictions[pdf$cellnum[x],
input$clusters == pdf$cluster[x]])
})
plots = lapply(1:length(unique(pdf$cluster)), function(x){
ggplot(pdf[pdf$cluster == x,], aes(x= dpt, y = val)) +
geom_ribbon(mapping = aes(ymin = val-sd, ymax = val + sd), fill = "grey80") +
geom_line() +
theme(axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank()) +
ggtitle(paste0(x, " (n=", sum(input$cluster==x),")" ))
})
return(plot_grid(plotlist = plots))
}
gut_hvgs = getHVGs(gut_sce)
clusters_gut = get_gene_clusters(gut_sce[gut_hvgs,],
gut_dpt,
seed = 42,
predict_even = TRUE,
normalise = TRUE,
model_mode = "loess")
clusters_gut_plot = plot_cluster_averages(clusters_gut, predict_even = TRUE)
clusters_gut_plot
Figure 28: Summaries of the detected gut gene clusters are shown
The black line indicated the mean value across genes for every observed DC1 coordinate; grey shading indicates the standard deviation at each value.
ggsave(clusters_gut_plot,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/gut_gene_clusters.pdf",
width = 8, height = 8)
df = data.frame(ensembl = names(clusters_gut$clusters),
mgi = genes[match(names(clusters_gut$clusters), genes[,1]),2],
cluster = clusters_gut$clusters)
write.table(df, row.names = FALSE, quote = FALSE, sep = ",", file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/gut_spatial_gene_clustering.csv")
It has been previously reported that cells from the visceral endoderm make contributions to the embryonic gut (Kwon et al. 2008). Moreover, the greatest contribution of these cells has been observed in the Hindgut (at E8.75). Can we see this in our data?
To probe this further, we will build a transport map between cells at different timepoints using Waddington-OT. In addition, we exclude the Mixed timepoint (as these cells were not synchronously harvested), and we add 100 extraembryonic endoderm cells from each timepoint, to allow cell mass from the visceral endoderm to contribute to both extrambryonic tissues and the embryonic gut.
A diffusion map of these cells is shown in Figure 29. A batch corrected PCA was recomputed for these cells, which is used to build the transport map.
keep = meta$cell %in% sub_meta$cell & meta$stage != "mixed_gastrulation"
set.seed(42)
alt = lapply(unique(sub_meta$stage), function(x){
sample(which(meta$celltype == "ExE endoderm" & meta$stage == x),
min(c(100, sum(meta$celltype == "ExE endoderm" & meta$stage == x, na.rm = TRUE))))
})
alt = do.call(c, alt)
keep[alt] = TRUE
keep[meta$stage == "mixed_gastrulation"] = FALSE
big_sce = scater::normalize(sce[, keep])
big_meta = meta[keep,]
big_hvgs = getHVGs(big_sce)
order_df = big_meta[!duplicated(big_meta$sample), c("stage", "sample")]
order_df$ncells = sapply(order_df$sample, function(x) sum(big_meta$sample == x))
order_df$stage = factor(order_df$stage,
levels = rev(c("E8.5",
"E8.25",
"E8.0",
"E7.75",
"E7.5",
"E7.25",
"mixed_gastrulation",
"E7.0",
"E6.75",
"E6.5")))
order_df = order_df[order(order_df$stage, order_df$ncells, decreasing = TRUE),]
order_df$stage = as.character(order_df$stage)
set.seed(42)
big_corrected = doBatchCorrect(counts = logcounts(big_sce)[rownames(big_sce) %in% big_hvgs,],
timepoints = big_meta$stage,
samples = big_meta$sample,
timepoint_order = order_df$stage,
sample_order = order_df$sample,
npc = 50)
## Warning in queryKNN(data2, query = data1, k = k1, BPPARAM = BPPARAM,
## get.distance = FALSE): 'k' capped at the number of observations
big_dmap = DiffusionMap(big_corrected)
save(big_corrected, big_sce, big_meta, big_dmap, file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/big_data.RData")
load("/nfs/research1/marioni/jonny/embryos/scripts/endoderm/big_data.RData")
exe = big_meta$cell[big_meta$stage %in% c("E8.25", "E8.5") &
big_meta$celltype == "ExE endoderm"]
ro = sample(nrow(big_meta), nrow(big_meta))
big_celltype = qplot(big_dmap$DC1[ro], big_dmap$DC2[ro], col = big_meta$celltype[ro]) +
scale_colour_manual(values = celltype_colours, name = "Celltype")+
labs(x = "DC1", y = "DC2") +
guides(col = guide_legend(override.aes = list(size = 3)))
big_stage = qplot(big_dmap$DC1[ro], big_dmap$DC2[ro], col = big_meta$stage[ro]) +
scale_colour_manual(values = stage_colours, name = "Stage")+
labs(x = "DC1", y = "DC2") +
guides(col = guide_legend(override.aes = list(size = 3)))
plot_grid(big_celltype, big_stage, ncol = 1, labels = "AUTO", align = "v")
Figure 29: Cells retained for transport map, coloured by celltype (A) and stage (B)
ggsave(big_celltype,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_exe_celltype.pdf",
width = 5, height = 5)
ggsave(big_stage,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/dmap_exe_stage.pdf",
width = 5, height = 5)
We now build the transport map.
# wot_pca = corrected
wot_pca = big_corrected
rownames(wot_pca) = NULL
colnames(wot_pca) = paste0("PC", 1:50)
wot_pca = cbind(data.frame("id" = big_meta$cell), as.data.frame(wot_pca))
wot_pca_loc = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/wot/pca.tab"
write.table(wot_pca,
file = wot_pca_loc,
col.names = TRUE,
row.names = FALSE,
quote = FALSE,
sep = "\t")
#prepare day metadata
wot_days = data.frame(id= big_meta$cell, day = substr(big_meta$stage, start = 2, stop = nchar(big_meta$stage)))
wot_day_loc = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/wot/days.tab"
write.table(wot_days,
file = wot_day_loc,
col.names = TRUE,
row.names = FALSE,
quote = FALSE,
sep = "\t")
#compute the transport maps
wot_out_loc = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/wot/"
command = paste("wot optimal_transport --max_threads 1 --local_pca 0 --out", wot_out_loc, "--format loom --matrix", wot_pca_loc, "--cell_days", wot_day_loc)
#this was run separately
# system(command)
What can we do using these transport maps? First, we will move forward through the maps, identifying how cell mass falls in the gut clusters (identified above) from populations of visceral and definitive endoderm cells at E7.0 (where the definitive endoderm set also includes anterior primitive streak cells). The cells selected, and results of the analysis, are shown in Figure 30. We classified cells as visceral endoderm-derived if they derived more mass from the visceral endoderm starting population than the definitive endoderm population.
#cells at which stages
ve_cells = big_meta$cell[big_meta$stage == "E7.0" & big_meta$celltype == "Visceral endoderm"]
string1 = paste("VE", "VE", paste0(ve_cells, collapse = "\t"), sep = "\t")
de_cells = big_meta$cell[big_meta$stage == "E7.0" & (big_meta$celltype == "Def. endoderm" |
big_meta$celltype == "Anterior Primitive Streak")]
string2 = paste("DE", "DE", paste0(de_cells, collapse = "\t"), sep = "\t")
wot_target_loc = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/wot/forward_targets.gmt"
wot_forward_out = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/wot/forward_output.txt"
write.table(c(string1, string2), file = wot_target_loc, col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
command = paste("wot trajectory --tmap", wot_out_loc, "--cell_set", wot_target_loc, "--time 7.0 --out", wot_forward_out)
#this was run separately
# system(command)
col = sapply(sub_meta$cell, function(x){
if(x %in% ve_cells){
return("VE")
} else if(x %in% de_cells){
return("DE")
} else {
return("Neither")
}
})
col = factor(col, levels = c("Neither", "VE", "DE"))
order = order(col)
p1= ggplot(graph_df[order,], aes(x = gephiX, y= gephiY, col = col[order])) +
geom_point() +
scale_colour_manual(values = c("DE" = "seagreen4", "VE" = "coral", "Neither" = "darkgrey"), name = "") +
theme(axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank())
wot_forward = read.table(wot_forward_out, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
sub_meta$forward_ratio = (wot_forward$DE/wot_forward$VE)[match(sub_meta$cell, wot_forward$id)]
p2 = ggplot(graph_df, aes(x = gephiX, y= gephiY, col = sub_meta$forward_ratio > 1)) +
geom_point() +
scale_colour_manual(values = c("TRUE" = "seagreen4", "FALSE" = "coral"), labels = c("TRUE" = "DE", "FALSE" = "VE"), name = "")+
theme(axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank())
late_meta$cluster.name = cluster_labels[late_clusts]
late_meta$origin = ifelse(sub_meta$forward_ratio[match(late_meta$cell, sub_meta$cell)] > 1, "DE", "VE")
tab = table(late_meta$cluster.name, late_meta$origin)
tab = sweep(tab, 1, rowSums(tab), "/")
df = as.data.frame(tab)
p3 = ggplot(data = df, mapping = aes(x = Var1, fill = Var2, y = Freq)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = c("DE" = "seagreen4", "VE" = "coral"), name = "") +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.x = element_blank())
p4 = ggplot(mapping = aes(x = late_meta$cluster.name, y = sub_meta$forward_ratio[match(late_meta$cell, sub_meta$cell)])) +
geom_violin() +
scale_y_log10() +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
axis.title.x = element_blank()) +
geom_hline(yintercept = 1) +
labs(y = "DE mass/VE mass")
plot_grid(p1, p2, p3,p4, labels = "AUTO")
Figure 30: Results of pushing forward through the transport map
A: Starting cells at E7.0 are shown. B: Cell allocations to the according to the mass contributed from starting populations. C: Number of cells allocated to each origin in the mature gut clusters. D: Distributions of mass ratios in the late gut clusters.
ggsave(p1,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/wot_forward_starting.pdf",
width = 6, height = 6)
ggsave(p2,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/wot_forward_result.pdf",
width = 6, height = 6)
ggsave(p3,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/wot_forward_barplot.pdf",
width = 6, height = 6)
Particularly, note that one hindgut cluster is composed almost entirely of cells that are predicted to derive from the visceral endoderm. These may be the cells described Kwon et al., and we will investigate these in the next section.
Instead of looking forward through the map, we could look backward from the cells in E8.5 endpoints that we have defined in the embyronic gut, above. These cells are shown in Figure 31, excluding the ExE endoderm.
cols = late_clusts[match(sub_meta$cell, late_meta$cell)]
cols = cluster_labels[match(cols, names(cluster_labels))]
cols[is.na(cols)] = "Not terminus"
cols = factor(cols, levels = c(cluster_labels, "Not terminus"), ordered = TRUE)
order = order(cols, decreasing = TRUE)
late_cols = cluster_colours
names(late_cols) = cluster_labels[names(cluster_colours)]
graph_endpoints = ggplot(graph_df[order,], aes(x = gephiX, y= gephiY, col = cols[order])) +
geom_point() +
scale_colour_manual(values = c(late_cols, "Not terminus" = "darkgrey"), name = "Endpoint") +
theme(axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank())
graph_endpoints
Figure 31: Endpoints on the force-directed layout
targets = lapply(1:7, function(x){
target_cells = late_meta$cell[late_clusts == x & late_meta$stage == "E8.5"]
string = paste(cluster_labels[x], cluster_labels[x], paste(target_cells, collapse="\t"), sep = "\t")
return(string)
})
targets[[8]] = paste("ExE endoderm",
"ExE endoderm",
paste(big_meta$cell[big_meta$celltype == "ExE endoderm" &
big_meta$stage == "E8.5"], collapse="\t"),
sep = "\t")
comb = do.call(c, targets)
wot_target_loc = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/wot/backward_targets.gmt"
wot_backward_out = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/wot/backward_output.txt"
write.table(comb, file = wot_target_loc, col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
command = paste("wot trajectory --tmap", wot_out_loc, "--cell_set", wot_target_loc, "--time 8.5 --out", wot_backward_out)
#this was run separately
# system(command)
wot_backward = read.table(wot_backward_out, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
ggsave(graph_endpoints,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/wot_backward_endpoints.pdf",
width = 6, height = 6)
In Figure 32, we show results for this approach to the map. Specifically, we show the fraction of cells at each timepoint which showed the greatest mass contribution to each of the celltype termini. However, we note that different cells may be called with a different level of confidence. We have a measure of the confidence of our trajectory assignments by considering the ratio of the maximum mass for each cell towards each endpoint (i.e. the one to which it was allocated) by the median mass for each cell across all endpoints. Where most mass derives from one or a few endpoints, the assignment is more likely to be correct; where mass derives from very many endpoints, it is likely that the cell is in an uncommitted state. Notably, the pharyngeal endoderm (to which many DE cells have been assigned) has very low levels of confidence by this measure.
wot_backward$stage = big_meta$stage[match(wot_backward$id, big_meta$cell)]
wot_backward$celltype = big_meta$celltype[match(wot_backward$id, big_meta$cell)]
wot_backward$max.mass = sapply(1:nrow(wot_backward), function(x){
row = as.numeric(wot_backward[x, 2:9])
names(row) = names(wot_backward)[2:9]
return(names(which.max(row)))
})
wot_backward$max.mass = gsub(".", " ", wot_backward$max.mass, fixed = TRUE)
wot_backward$max.mass[wot_backward$max.mass == "Midgut Hindgut"] = "Midgut/Hindgut"
wot_backward$confidence = sapply(1:nrow(wot_backward), function(x){
row = as.numeric(wot_backward[x, 2:9])
# return(max(row)/max(row[-which.max(row)]))
return(max(row)/median(row))
})
wot_backward$hg2.score = sapply(1:nrow(wot_backward), function(x){
row = as.numeric(wot_backward[x, 2:9])
names(row) = names(wot_backward)[2:9]
return(row["Hindgut.2"]/max(row))
})
p1 = ggplot(data = wot_backward, mapping = aes(x = stage, fill = max.mass)) +
geom_bar() +
scale_fill_manual(values = c(late_cols, "ExE endoderm" = "grey20"), name = "Max mass") +
facet_wrap(~celltype) +
theme(axis.text.x = element_text(angle = 90, hjust= 1, vjust = 0.5),
axis.title.x = element_blank())
p2 = ggplot(data = wot_backward, mapping = aes(x = max.mass, y = confidence)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, hjust= 1, vjust = 0.5)) +
scale_y_log10() +
coord_cartesian(ylim = c(1, 10)) +
labs(x = "Trajectory assignment", y = "Max mass/Median mass")
plot_grid(p1, p2, nrow = 2, labels = "AUTO")
## Warning: Removed 678 rows containing non-finite values (stat_boxplot).
Figure 32: Results of pulling backward through the transport map
A: The terminal clusters to which cells of different timepoints and stages derive the most mass. B: A measure of confidence of each assignment is shown, for cells of all timepoints assigned to each endpoint with respect to the maximum mass contributed.
The assignments made according to maximum mass are shown in Figure ?? on the force-directed layout.
graph_backwards = ggplot(graph_df, aes(x = gephiX, y= gephiY, col = wot_backward$max.mass[match(sub_meta$cell, wot_backward$id)])) +
geom_point() +
scale_colour_manual(values = c(late_cols, "ExE endoderm" = "grey20"), name = "Max mass") +
theme(axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank())
graph_backwards
We will return to classification of these trajectories in the final part of our analyses in this document.
We note that the Hindgut 1 cluster appears to receive large contributions from the visceral endoderm in our transport map analyses, whereas Hindgut 2 does not.
NA
combs = combn(names(cluster_labels), 2)
combs_name= matrix(cluster_labels[combs], nrow = 2, byrow=FALSE)
do_de = function(clust1, clust2){
wee_sce = scater::normalize(late_sce[,late_clusts %in% c(clust1, clust2)])
wee_meta = late_meta[late_clusts %in% c(clust1, clust2),]
avg = calcAverage(wee_sce)
de = findMarkers(wee_sce,
clusters = late_clusts[late_clusts %in% c(clust1, clust2)],
block = factor(wee_meta$sample),
full.stats = TRUE,
lfc = 0.5)
de = de[[1]][[4]]
de = de[rownames(de) %in% names(avg[avg > 0.1]),]
de$p.value = exp(de$log.p.value)
de$FDR = p.adjust(de$p.value, method = "fdr")
de$mgi = genes[match(rownames(de), genes[,1]),2]
return(de)
}
des = apply(combs, 2, function(x) do_de(x[1], x[2]))
# # fracde = sapply(des, function(x) sum(x$FDR < 0.1)/nrow(x))
# nde = sapply(des, function(x) sum(x$FDR < 0.1))
# # nfc = sapply(des, function(x) sum(x$logFC > 0.5 | x$logFC < -0.5))
#
# df = data.frame(c1 = combs[1,], c2 = combs[2,], nde = nde)
#
# df$hg1 = ifelse(df$c1 == 7 | df$c2 == 7, TRUE, FALSE)
#
# ggplot(df, aes(x = hg1, y = df$nde, col = hg1)) +
# geom_violin()
#
# tab = acast(data = df, formula = c1 ~ c2, value.var = "nde")
# tab = cbind(rep(NA, 6),tab)
# tab = rbind(tab, rep(NA, 7))
# colnames(tab)[1] = 1
# rownames(tab)[nrow(tab)] = 7
# diag(tab) = 0
#
# colnames(tab) = cluster_labels[as.character(colnames(tab))]
# rownames(tab) = cluster_labels[as.character(rownames(tab))]
#
# pheatmap(tab)
hind_de = des[[which(apply(combs, 2, function(x) all(c(7,5) %in% x)))]]
fore_de = des[[which(apply(combs, 2, function(x) all(c(1,3) %in% x)))]]
hits = c(as.character(hind_de$mgi[hind_de$logFC > 0][1:5]),
as.character(hind_de$mgi[hind_de$logFC < 0][1:5]))
labs = hind_de[hind_de$mgi %in% hits,]
de_hg = ggplot(as.data.frame(hind_de), aes(x = logFC, y = -log10(p.value), col = FDR < 0.1 ,
size = as.character((FDR < 0.1) + (mgi %in% labs$mgi)))) +
geom_point() +
scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
scale_size_manual(values = c("1" = 1, "0" = 0.2, "2" = 2)) +
labs(x = "logFC (Hindgut 1/Hindgut 2)", y = "-log10(p)") +
geom_text_repel(data = as.data.frame(labs),
mapping = aes(x = logFC, y= -log10(p.value), label = mgi),
nudge_y = 1.5,
nudge_x = 0.1,
inherit.aes = FALSE) +
theme(legend.position = "none")
de_hg
Figure 33: Volcano plot of diffiential expression in the two terminal hindgut clusters
Genes on the right of the plot are more highly expressed in the embryonic derived hindgut. Genes highlighted in red passed significance testing (FDR-adjusted p < 0.1). Genes with a minimum mean normalised count of 0.1 across both clusters were considered for DE analysis. The four labelled genes are mentioned below.
ggsave(de_hg,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/hindgut_ve_volcano.pdf",
width = 6, height = 6)
de_fg = ggplot(as.data.frame(fore_de), aes(x = logFC, y = -log10(p.value), col = FDR < 0.1 ,
size = as.character((FDR < 0.1) + (mgi %in% labs$mgi)))) +
geom_point() +
scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
scale_size_manual(values = c("1" = 1, "0" = 0.2, "2" = 2)) +
labs(x = "logFC (Foregut 2/Foregut 1)", y = "-log10(p)") +
theme(legend.position = "none")
# genes_ve = rownames(hind_de)[hind_de$FDR < 0.1 & hind_de$logFC < 0]
# enrich_ve = enrich_genes_goana(genes_ve, rownames(hind_de), warn_missing = FALSE)$result
#
# genes_de = rownames(hind_de)[hind_de$FDR < 0.1 & hind_de$logFC > 0]
# enrich_de = enrich_genes_goana(genes_de, rownames(hind_de), warn_missing = FALSE)$result
#
# x <- counts(hind_sce[calcAverage(hind_sce) > 1,])
# group <- factor(late_clusts[late_clusts %in% c(5,7)])
# y <- DGEList(counts=x,group=group)
# y <- calcNormFactors(y)
# design <- model.matrix(~group)
# y <- estimateDisp(y,design)
#
#
# fit <- glmFit(y,design)
# lrt <- glmLRT(fit,coef=2)
# out = lrt$table
# out$mgi = genes[match(rownames(out), genes[,1]), 2]
# out = out[order(out$PValue),]
# out$fdr = p.adjust(out$PValue, method = "fdr")
#
#
# out = glmTreat(fit)
#
# v = voom(y, design)
# fit <- lmFit(v, design)
# fit <- eBayes(fit)
# topTable(fit, coef=ncol(design))
# padj = p.adjust(fit$p.value[,2], method = "fdr")
By comparison, the DE between the two foregut clusters is also shown in Figure 34.
lims = lims(x = c(min(c(fore_de$logFC, hind_de$logFC)),
max(c(fore_de$logFC, hind_de$logFC))),
y = c(0, max(c(-log10(fore_de$p.value),
-log10(hind_de$p.value)))))
grid = plot_grid(de_hg + lims,
de_fg + lims)
grid
Figure 34: DE between the Foregut and Hindgut clusters
The two genes that are by far most highly differentially expressed genes are X-linked (Trap1a and Rhox5, as has been mentioned in this preprint). However, this does not correspond to a wider trend across X-linked genes: these is shown in Figure 35.
mouse_ensembl = useMart("ensembl")
mouse_ensembl = useDataset("mmusculus_gene_ensembl", mart = mouse_ensembl)
gene_map = getBM(attributes=c("ensembl_gene_id", "chromosome_name"), filters = "ensembl_gene_id", values = rownames(late_sce), mart = mouse_ensembl)
xchr = gene_map[gene_map[,2] == "X", 1]
ggplot(mapping = aes(x = hind_de$logFC[rownames(hind_de) %in% xchr])) +
geom_histogram(bins = 20) +
labs(x = "logFC (Hindgut 2/Hindgut 1)", y = "Count")
Figure 35: Fold changes for X-linked genes are shown
There is no systematic deviation of the fold changes from 0.
The expression patterns of these genes are shown in the full dataset in Figure 36.
p_rhox = plot_expr(coord_1 = umap[,1],coord_2 = umap[,2], gene_name = "Rhox5", labels = c("UMAP1", "UMAP2"), sce = sce)
p_trap = plot_expr(coord_1 = umap[,1],coord_2 = umap[,2], gene_name = "Trap1a", labels = c("UMAP1", "UMAP2"), sce = sce)
p_celltype = ggplot(as.data.frame(umap), aes(x = V1, y = V2, col = factor(meta$celltype, levels = names(celltype_colours), ordered = TRUE))) +
geom_point(size = 0.4) +
labs(x = "UMAP1", y = "UMAP2") +
scale_color_manual(values = celltype_colours, name = "Celltypes") +
guides(colour = guide_legend(override.aes = list(size=4)))
plot_grid(plot_grid(p_rhox, p_trap, nrow = 1),
p_celltype,
ncol = 1)
Figure 36: Rhox5 and Trap1a expression across the atlas dataset
Interestingly, two other DE genes were Cdx1 and Cdx4. The expression pattern across the gut is shown in Figure 37.
p1 = gut_plot("Cdx1")
p2 = gut_plot("Cdx4")
plot_grid(p1, p2, ncol =1)
Figure 37: Cdx1 and Cdx4 expression in the gut
Given the strong differential expression of Rhox5 and Trap1a, and their expression in other extraembryonic tissues, how are these genes distributed in the gut? This is shown in Figure 38.
p1 = gut_plot("Rhox5")
p2 = gut_plot("Trap1a")
plot_grid(p1, p2, nrow = 2)
Figure 38: Rhox5 and Trap1a expression in the mature gut cells
What fraction of cells in each of the mature gut clusters express either of these genes? This is shown in Figure 39
rhox = as.numeric(logcounts(late_sce[match("Rhox5", genes[,2]),]))
trap = as.numeric(logcounts(late_sce[match("Trap1a", genes[,2]),]))
either = rhox | trap
tab = table(late_clusts, either)
tab = sweep(tab, 1, rowSums(tab), "/")
rownames(tab) = cluster_labels[match(rownames(tab), names(cluster_labels))]
mlt = melt(tab)
exe_mark = ggplot(mlt, aes(x = late_clusts, y = value, fill = either)) +
geom_bar(stat = "identity") +
scale_fill_manual(labels = c("TRUE" = "Trap1a+ or Rhox5+", "FALSE" = "Trap1a- & Rhox5-"), values= c("TRUE" = "coral", "FALSE" = "seagreen4"), name = "") +
labs(y = "Fraction of cells") +
theme(axis.text.x = element_text(angle = -25, hjust = 0),
axis.title.x = element_blank())
exe_mark
Figure 39: The fraction of cells expressing either Trap1a or Rhox5 is shown for each cluster of the embryonic gut
ggsave(exe_mark,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/trap_rox_expr.pdf",
width = 6, height = 6)
Assuming that these genes mark visceral endoderm contribution, there appear to be VE-derived cells present in all gut lineages.
To generalise our DE comparisons across all these tissues, we have performed a differential expression analysis between gut cells at the E8.25 and E8.5 timepoints that express either of those genes and cells from the same timepoints and tissues that do not. Additionally, the assigned cluster for each cell (e.g., “Foregut 1”) was included as a blocking factor, to avoid effects of compositional differences, and DE was tested against a logFC threshold of 0.5. This procedure should test for systematic differences in the expression profiles of Rhox5 or Trap1a positive cells compared to the negative cells irrespective of the region of the gut to which they contribute. Interestingly, few other genes appear to differ (Figure 40).
rt = calcAverage(late_sce) > 1
markers = findMarkers(late_sce[rt,], clusters = as.numeric(either), block = late_clusts, full.stats = TRUE, lfc = 0.5)
markers$`0`$mgi = genes[match(rownames(markers$`0`), genes[,1]),2]
de_exe = markers$`0`$stats.1
de_exe$FDR = exp(de_exe$log.FDR)
de_exe$mgi = markers$`0`$mgi
labs = de_exe[de_exe$FDR < 0.1,]
ggplot(as.data.frame(de_exe), aes(x = logFC, y = -log10(exp(log.p.value)), col = FDR < 0.1, size = FDR < 0.1)) +
geom_point() +
scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
scale_size_manual(values = c("TRUE" = 1, "FALSE" = 0.2)) +
labs(x = "logFC (Midgut/Foregut 2)", y = "-log10(p)") +
geom_label(data = as.data.frame(labs),
mapping = aes(x = logFC, y= -log10(exp(log.p.value)), label = mgi),
nudge_y = 1.5,
inherit.aes = FALSE) +
theme(legend.position = "none")
Figure 40: Differential expression volcano plot between cells that do or do not express Rhox5 or Trap1a, at E8
25 and E8.5.
Interestingly, the Hindgut 1 cluster was positioned at the most hindward position of our spatial ordering of the gut tube (Figure 41).
gut_density
Figure 41: Density of celltypes along the pseudospatial ordering of gut cells
In order to validate both this suggestion of hindward-ness, and the presence of VE-derived cells concentrated in the hindgut, we dissected a Ttr::Cre lineage traced mouse at E8.5, exposing the embryonic gut. Ttr is very highly expressed in the visceral endoderm, and such cells and their descendants become fluorescently labelled. An image from the dissection is shown in Figure 42. Note that lineage traced cells are visible at a low level across the length of the gut tube, and moreover are very highly concentrated at the most rearward section of the embryo.
knitr::include_graphics(paste0("/nfs/research1/marioni/jonny/embryos/scripts/endoderm/gut_microscopy.png"))
Figure 42: Dissection of a Ttr-Cre lineage traced mouse at E8
5, exposing the embryonic gut. Red fluorescence is phalloidin, green fluorescence is YFP that is expressed in Ttr expressing cells and their progeny.
We have observed that many cells from the visceral endoderm appear to contribute to the hindgut of the developing mouse, which is supported by other linage-tracing experiments. How do these cells mature from their respective origins?
To define these trajectories, we use the masses calculated when we pulled backward through the transport maps. Specifically:
For the visceral endoderm to Hindgut 1 trajectory, we consider cells with largest mass contribution to Hindgut 1. These cells are all classified in the celltype visceral endoderm.
For the definitive endoderm to Hindgut 2 trajectory, we include cells with largest mass contribution to Hindgut 2 which are of the celltypes gut or definitive endoderm (i.e. excluding visceral endoderm). However, these cells do not extend into the definitive endoderm, where we observe roughly similar mass contributions to various terminal celltypes (Figure 32B). We therefore also include cells for which the Hindgut 2 mass contribution is >90% the contribution of the celltype terminus with largest contribution, assuming that these cells represent common progenitors.
The cells allocated to each trajectory are shown in Figure 43
cells_hg1 = wot_backward$id[wot_backward$max.mass == "Hindgut 1"]
cells_hg2 = wot_backward$id[wot_backward$hg2.score > 0.9 &
wot_backward$celltype %in% c("Def. endoderm",
"Gut")]
col = sapply(sub_meta$cell, function(x){
if(x %in% cells_hg1){
return("Hindgut 1")
} else if(x %in% cells_hg2){
return("Hindgut 2")
} else {
return("Neither")
}})
ord = order(col, decreasing = TRUE)
traj_wot = ggplot(mapping = aes(x = graph_df$gephiX[ord], y = graph_df$gephiY[ord], col = col[ord])) +
geom_point(size = 1) +
scale_color_manual(values = c("Hindgut 1" = "coral", "Hindgut 2" = "seagreen4", "Neither" = "darkgrey"), name = "") +
theme(axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(x = "gephiX", y = "gephiY")
traj_wot
Figure 43: Cells included in trajectories from origin cells to hindgut
ggsave(traj_wot,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/wot_backward_trajectories.pdf",
width = 6, height = 6)
We take these cells and compute diffusion maps using subsetted batch-corrected PCs.
hg1_sce = scater::normalize(sub_sce[, sub_meta$cell %in% cells_hg1])
hg1_meta = sub_meta[sub_meta$cell %in% cells_hg1,]
set.seed(42)
hg1_dmap = DiffusionMap(corrected[match(hg1_meta$cell, rownames(corrected)),])
hg2_sce = scater::normalize(sub_sce[,sub_meta$cell %in% cells_hg2])
hg2_meta = sub_meta[sub_meta$cell %in% cells_hg2,]
set.seed(42)
hg2_dmap = DiffusionMap(corrected[match(hg2_meta$cell, rownames(corrected)),])
Expression of of the genes Ttr and Cdx2 in the cells of the Hindgut 1 trajectory is shown in Figure 44. These mark the visceral endoderm (start) and hindgut (end) of the trajectory - we compute DPT from the cell with maxmum value of DC1 in this plot. The diffusion map was calculated from the Hindgut 1 trajectory subset of the batch-corrected PCA coordinates calculated for the endoderm cells. Also shown is the stage from which cells derived, and the DPT value assigned.
hg1_dpt = DPT(hg1_dmap)[[paste0("DPT", which.max(hg1_dmap$DC1))]]
ro = sample(nrow(hg1_meta), nrow(hg1_meta))
hg1_ttr = plot_expr(hg1_dmap$DC1, hg1_dmap$DC2, "Ttr", sce = hg1_sce)
hg1_cdx = plot_expr(hg1_dmap$DC1, hg1_dmap$DC2, "Cdx2", sce = hg1_sce)
hg1_stage = ggplot(mapping = aes(x = hg1_dmap$DC1[ro], y= hg1_dmap$DC2[ro], col = hg1_meta$stage[ro])) +
geom_point() +
scale_color_manual(values = stage_colours, labels = stage_labels, name = "") +
labs(x = "DC1", y = "DC2")
hg1_dptcol = ggplot(mapping = aes(x = hg1_dmap$DC1[ro], y= hg1_dmap$DC2[ro], col = hg1_dpt[ro])) +
geom_point() +
scale_color_viridis_c(name = "DPT") +
labs(x = "DC1", y = "DC2")
plot_grid(hg1_ttr, hg1_cdx, hg1_stage, hg1_dptcol)
Figure 44: Hindgut 1 trajectory cell summary
The correspondance of DPT to cell timepoints is shown in Figure 45.
hg1_meta$dpt = hg1_dpt
p1 = ggplot(hg1_meta, aes(x = stage, col = stage, y = dpt)) +
geom_jitter(height = 0, width = 0.3) +
scale_color_manual(values = stage_colours, labels = stage_labels) +
scale_x_discrete(labels = stage_labels) +
coord_flip() +
labs(y = "DPT") +
theme(axis.title.y = element_blank(),
legend.position = "none")
p1
Figure 45: DPT correlates with cell capture times
ggsave(p1,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/hg1_stage_dpt_points.pdf",
width = 4, height = 6)
Similarly for the Hindgut 2 trajectory, a diffusion map embedding with the genes Pou5f1 and Cdx4 in the Hindgut 1 trajectory is shown in Figure 46. These mark the primitive streak (start) and hindgut (end) of the trajectory - we compute DPT from the cell with minimum value of DC1 in this plot. DPT was calculated as for the hindgut 1 trajectory. Also shown is the stage from which cells derived, and the DPT value assigned.
ro = sample(nrow(hg2_meta), nrow(hg2_meta))
hg2_pou = plot_expr(hg2_dmap$DC1, hg2_dmap$DC2, "Pou5f1", sce = hg2_sce)
hg2_cdx = plot_expr(hg2_dmap$DC1, hg2_dmap$DC2, "Cdx4", sce = hg2_sce)
hg2_dpt = DPT(hg2_dmap)[[paste0("DPT", which.min(hg2_dmap$DC1))]]
hg2_stage = ggplot(mapping = aes(x = hg2_dmap$DC1[ro], y= hg2_dmap$DC2[ro], col = hg2_meta$stage[ro])) +
geom_point() +
scale_color_manual(values = stage_colours, labels = stage_labels, name = "") +
labs(x = "DC1", y = "DC2")
hg2_dptcol = ggplot(mapping = aes(x = hg2_dmap$DC1[ro], y= hg2_dmap$DC2[ro], col = hg2_dpt[ro])) +
geom_point() +
scale_color_viridis_c(name = "DPT") +
labs(x = "DC1", y = "DC2")
plot_grid(hg2_pou, hg2_cdx, hg2_stage, hg2_dptcol)
Figure 46: Hindgut 1 trajectory cell summary
The correspondance of DPT to cell timepoints is shown in Figure 47.
hg2_meta$dpt = hg2_dpt
p1 = ggplot(hg2_meta, aes(x = stage, col = stage, y = dpt)) +
geom_jitter(height = 0, width = 0.3) +
scale_colour_manual(values = stage_colours, labels = stage_labels) +
scale_x_discrete(labels = stage_labels) +
coord_flip() +
labs(y = "DPT") +
theme(axis.title.y = element_blank(),
legend.position = "none")
p1
Figure 47: DPT correlates with cell capture times
ggsave(p1,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/hg2_stage_dpt_points.pdf",
width = 4, height = 6)
We now apply our clustering approach (described above, Spatial gene expression dynamics of the gut tube) to these cells along their pseudotime trajectories.
Summaries of the identified gene clusters are shown in Figure 48 (for the VE to hindgut trajectory) and Figure 49 (for the streak to hindgut trajectory).
hg1_hvgs = getHVGs(hg1_sce)
hg2_hvgs = getHVGs(hg2_sce)
clusters_hg1 = get_gene_clusters(hg1_sce[hg1_hvgs,],
hg1_dpt,
seed = 42,
predict_even = TRUE,
normalise = TRUE,
model_mode = "loess")
## ..cutHeight not given, setting it to 0.836 ===> 99% of the (truncated) height range in dendro.
## ..done.
clusters_hg2 = get_gene_clusters(hg2_sce[hg2_hvgs,],
hg2_dpt,
seed = 42,
predict_even = TRUE,
normalise = TRUE,
model_mode = "loess")
## ..cutHeight not given, setting it to 0.847 ===> 99% of the (truncated) height range in dendro.
## ..done.
hg1_plot = plot_cluster_averages(clusters_hg1, predict_even = TRUE)
hg1_plot
Figure 48: Summaries of the detected gene clusters for the hindgut 1 trajectory are shown
The black line indicated the mean value across genes for every observed DC1 coordinate; grey shading indicates the standard deviation at each value.
ggsave(hg1_plot,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/hg1_clusters.pdf",
width = 8, height = 8)
hg2_plot = plot_cluster_averages(clusters_hg2, predict_even = TRUE)
hg2_plot
Figure 49: Summaries of the detected gene clusters for the hindgut 2 trajectory are shown
The black line indicated the mean value across genes for every observed DC1 coordinate; grey shading indicates the standard deviation at each value.
ggsave(hg2_plot,
file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/plots/hg2_clusters.pdf",
width = 8, height = 8)
clusterdf = data.frame(trajectory = c(rep("VE-HG1", length(clusters_hg1$clusters)),
rep("DE-HG2", length(clusters_hg2$clusters))),
ensembl = c(names(clusters_hg1$clusters),
names(clusters_hg2$clusters)),
cluster = c(clusters_hg1$clusters,
clusters_hg2$clusters))
clusterdf$mgi = genes[match(clusterdf$ensembl, genes[,1]),2]
write.table(clusterdf, file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/gene_clustering_trajectories.csv", sep = ",", col.names = TRUE, row.names = FALSE, quote = FALSE)
To confirm that the trajectories that we have identified make sense, we incpect the expression patterns of several individual genes. First, in Figure 50, we show the expression of Rhox5 and Trap1a along the two trajectories described above. Particularly, note that there remains low to no expression of these genes along the entire length of the Hindgut 2 trajectory, suggesting that at no point does it contain a large number of extraembryonic-derived cells.
expr_rhox_1 = as.numeric(logcounts(sub_sce[match("Rhox5", genes[,2]), sub_meta$cell %in% cells_hg1]))
p1 = ggplot(mapping = aes(x = clusters_hg1$dpt, y = expr_rhox_1)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, col = "darkgrey") +
labs(x = "DPT", y = "Rhox5 expression") +
ggtitle("Hindgut 1 trajectory") +
lims(y = c(0,6))
expr_rhox_2 = as.numeric(logcounts(sub_sce[match("Rhox5", genes[,2]), sub_meta$cell %in% cells_hg2]))
p2 = ggplot(mapping = aes(x = clusters_hg2$dpt, y = expr_rhox_2)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, col = "darkgrey") +
labs(x = "DPT", y = "Rhox5 expression") +
ggtitle("Hindgut 2 trajectory") +
lims(y = c(0,6))
expr_trap_1 = as.numeric(logcounts(sub_sce[match("Trap1a", genes[,2]), sub_meta$cell %in% cells_hg1]))
p3 = ggplot(mapping = aes(x = clusters_hg1$dpt, y = expr_trap_1)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, col = "darkgrey") +
labs(x = "DPT", y = "Trap1a expression") +
ggtitle("Hindgut 1 trajectory") +
lims(y = c(0,6))
expr_trap_2 = as.numeric(logcounts(sub_sce[match("Trap1a", genes[,2]), sub_meta$cell %in% cells_hg2]))
p4 = ggplot(mapping = aes(x = clusters_hg2$dpt, y = expr_trap_2)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, col = "darkgrey") +
labs(x = "DPT", y = "Trap1a expression") +
ggtitle("Hindgut 2 trajectory") +
lims(y = c(0,6))
plot_grid(p1, p2, p3, p4)
Figure 50: Trap1a and Rhox5 expression is high along the Hindgut 1 trajectory, while remaining low across the Hindgut 2 trajectory
We also show the expression of Y-chromosomal genes in Figure 51 to show that trajectories are not driven by sex differences.
ygenes = gene_map$ensembl_gene_id[gene_map$chromosome_name == "Y"]
yexpr = counts(sub_sce[ygenes, sub_meta$cell %in% cells_hg1])
ytot1 = Matrix::colSums(yexpr) / sizeFactors(sub_sce[,sub_meta$cell %in% cells_hg1])
ytot1 = log2(ytot1 + 1)
p1 = ggplot(mapping = aes(x = clusters_hg1$dpt, y = ytot1)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, col = "darkgrey") +
labs(x = "DPT", y = "YCHR log counts") +
ggtitle("Hindgut 1 trajectory") +
lims(y = c(0,4))
yexpr = counts(sub_sce[ygenes, sub_meta$cell %in% cells_hg2])
ytot2 = Matrix::colSums(yexpr) / sizeFactors(sub_sce[,sub_meta$cell %in% cells_hg2])
ytot2 = log2(ytot2 + 1)
p2 = ggplot(mapping = aes(x = clusters_hg2$dpt, y = ytot2)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, col = "darkgrey") +
labs(x = "DPT", y = "YCHR log counts") +
ggtitle("Hindgut 2 trajectory") +
lims(y = c(0,4))
plot_grid(p1, p2)
Figure 51: There is no notable difference in Y-chromosome gene expression between the two trajectories
We calculate GO enrichments for the genes in these clusters using limma::goana. The background gene list is the set of expressed genes (mean normalised count >0.1) in each of the Hindgut 1 and Hindgut 2 trajectories.
ca = calcAverage(hg1_sce)
hg1_genes = names(ca)[ca > 0.1]
enrich_hg1 = lapply(unique(clusters_hg1$clusters), function(x){
enrich = enrich_genes_goana(hits = names(clusters_hg1$clusters)[clusters_hg1$clusters == x],
# universe = names(clusters_hg1$clusters),
universe = hg1_genes,
warn_missing = FALSE)$result
enrich= enrich[enrich$Ont == "BP", ]
enrich$FDR = p.adjust(enrich$P.DE, method = "fdr")
enrich$cluster = x
return(enrich)
})
enrich_hg1 = do.call(rbind, enrich_hg1)
ca = calcAverage(hg2_sce)
hg2_genes = names(ca)[ca > 0.1]
enrich_hg2 = lapply(unique(clusters_hg2$clusters), function(x){
enrich = enrich_genes_goana(hits = names(clusters_hg2$clusters)[clusters_hg2$clusters == x],
# universe = names(clusters_hg2$clusters),
universe = hg2_genes,
warn_missing = FALSE)$result
enrich= enrich[enrich$Ont == "BP", ]
enrich$FDR = p.adjust(enrich$P.DE, method = "fdr")
enrich$cluster = x
return(enrich)
})
enrich_hg2 = do.call(rbind, enrich_hg2)
write.table(enrich_hg1, file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/hg1_enrichments.csv",
sep = ",", quote = FALSE, col.names = TRUE, row.names = TRUE)
write.table(enrich_hg2, file = "/nfs/research1/marioni/jonny/embryos/scripts/endoderm/hg2_enrichments.csv",
sep = ",", quote = FALSE, col.names = TRUE, row.names = TRUE)
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
##
## locale:
## [1] C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] org.Mm.eg.db_3.6.0 AnnotationDbi_1.43.1
## [3] limma_3.37.7 dynamicTreeCut_1.63-1
## [5] cowplot_0.9.3 biomaRt_2.37.6
## [7] pheatmap_1.0.10 RColorBrewer_1.1-2
## [9] knitr_1.20 FateID_0.1.4
## [11] scater_1.9.20 viridis_0.5.1
## [13] viridisLite_0.3.0 destiny_2.11.3
## [15] ggrepel_0.8.0 ggplot2_3.0.0
## [17] scales_1.0.0 reshape2_1.4.3
## [19] igraph_1.2.2 irlba_2.3.2
## [21] Rtsne_0.13 scran_1.9.29
## [23] SingleCellExperiment_1.3.10 SummarizedExperiment_1.11.6
## [25] DelayedArray_0.7.45 matrixStats_0.54.0
## [27] Biobase_2.41.2 GenomicRanges_1.33.14
## [29] GenomeInfoDb_1.17.2 IRanges_2.15.18
## [31] S4Vectors_0.19.19 BiocGenerics_0.27.1
## [33] BiocParallel_1.15.12 Matrix_1.2-14
## [35] BiocStyle_2.9.6 rmarkdown_1.10
##
## loaded via a namespace (and not attached):
## [1] readxl_1.1.0 backports_1.1.2
## [3] RcppEigen_0.3.3.4.0 plyr_1.8.4
## [5] lazyeval_0.2.1 sp_1.3-1
## [7] crosstalk_1.0.0 kmknn_0.99.16
## [9] lle_1.1 digest_0.6.17
## [11] htmltools_0.3.6 GO.db_3.6.0
## [13] memoise_1.1.0 magrittr_1.5
## [15] openxlsx_4.1.0 xts_0.11-1
## [17] prettyunits_1.0.2 princurve_2.1.2.1
## [19] colorspace_1.3-2 blob_1.1.1
## [21] haven_1.1.2 xfun_0.3
## [23] dplyr_0.7.6 crayon_1.3.4
## [25] RCurl_1.95-4.11 jsonlite_1.5
## [27] bindr_0.1.1 zoo_1.8-4
## [29] glue_1.3.0 gtable_0.2.0
## [31] zlibbioc_1.27.0 XVector_0.21.3
## [33] webshot_0.5.1 car_3.0-2
## [35] Rhdf5lib_1.3.3 DEoptimR_1.0-8
## [37] HDF5Array_1.9.19 abind_1.4-5
## [39] VIM_4.7.0 DBI_1.0.0
## [41] edgeR_3.23.5 som_0.3-5.1
## [43] ggthemes_4.0.1 miniUI_0.1.1.1
## [45] Rcpp_0.12.18 progress_1.2.0
## [47] xtable_1.8-3 laeken_0.4.6
## [49] bit_1.1-14 foreign_0.8-70
## [51] proxy_0.4-22 vcd_1.4-4
## [53] httr_1.3.1 htmlwidgets_1.2
## [55] pkgconfig_2.0.2 XML_3.98-1.16
## [57] nnet_7.3-12 locfit_1.5-9.1
## [59] labeling_0.3 tidyselect_0.2.4
## [61] rlang_0.2.2 manipulateWidget_0.10.0
## [63] later_0.7.5 munsell_0.5.0
## [65] cellranger_1.1.0 tools_3.5.0
## [67] RSQLite_2.1.1 evaluate_0.11
## [69] stringr_1.3.1 yaml_2.2.0
## [71] bit64_0.9-7 zip_1.0.0
## [73] robustbase_0.93-3 rgl_0.99.16
## [75] purrr_0.2.5 randomForest_4.6-14
## [77] bindrcpp_0.2.2 mime_0.5
## [79] compiler_3.5.0 beeswarm_0.2.3
## [81] curl_3.2 e1071_1.7-0
## [83] smoother_1.1 tibble_1.4.2
## [85] statmod_1.4.30 stringi_1.2.4
## [87] highr_0.7 forcats_0.3.0
## [89] lattice_0.20-35 pillar_1.3.0
## [91] BiocManager_1.30.2 lmtest_0.9-36
## [93] data.table_1.11.8 bitops_1.0-6
## [95] httpuv_1.4.5 R6_2.2.2
## [97] bookdown_0.7 promises_1.0.1
## [99] gridExtra_2.3 rio_0.5.10
## [101] vipor_0.4.5 boot_1.3-20
## [103] MASS_7.3-49 assertthat_0.2.0
## [105] rhdf5_2.25.11 rprojroot_1.3-2
## [107] withr_2.1.2 GenomeInfoDbData_1.1.0
## [109] hms_0.4.2 grid_3.5.0
## [111] class_7.3-14 DelayedMatrixStats_1.3.9
## [113] carData_3.0-1 TTR_0.23-4
## [115] scatterplot3d_0.3-41 shiny_1.1.0
## [117] snowfall_1.84-6.1 ggbeeswarm_0.6.0